Dataset¶

https://www.kaggle.com/datasets/faviovaz/marketing-ab-testing

Imports¶

In [4]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from scipy.stats import chi2_contingency, shapiro, levene, mannwhitneyu

import warnings 
warnings.filterwarnings("ignore")

# Shared plot styling, reused by the figures in this notebook
colorcategories = [
    '#A6C8E0',  # light blue
    '#3182BD',  # medium blue
    '#1D3B5D',  # dark blue
]
colorback = 'rgba(0,0,0,0)'  # alpha of 0 -> fully transparent background
colortext = '#36414e'        # font colour passed to the figure layouts
fsize = 12                   # font size passed to the figure layouts

Exploratory analysis¶

In [6]:
# Load the marketing A/B test CSV (expected in the working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer="marketing_AB.csv")
In [7]:
# Preview the first rows to sanity-check the column layout and value formats
df.head()
Out[7]:
Unnamed: 0 user id test group converted total ads most ads day most ads hour
0 0 1069124 ad False 130 Monday 20
1 1 1119715 ad False 93 Tuesday 22
2 2 1144181 ad False 21 Tuesday 18
3 3 1435133 ad False 355 Tuesday 10
4 4 1015700 ad False 276 Friday 14
In [8]:
# Inspect column dtypes and non-null counts; a non-null count below the
# number of entries would indicate missing values in that column
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 588101 entries, 0 to 588100
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Unnamed: 0     588101 non-null  int64 
 1   user id        588101 non-null  int64 
 2   test group     588101 non-null  object
 3   converted      588101 non-null  bool  
 4   total ads      588101 non-null  int64 
 5   most ads day   588101 non-null  object
 6   most ads hour  588101 non-null  int64 
dtypes: bool(1), int64(4), object(2)
memory usage: 27.5+ MB
In [9]:
# Check that no user id appears more than once (True means every row has a distinct user id)
df['user id'].is_unique
Out[9]:
True
In [10]:
# Drop columns that are not used in the analysis: "Unnamed: 0" (looks like a saved row
# index - it mirrors the DataFrame index in the preview) and "user id" (a unique identifier).
# The columns are passed as a list rather than a set so their order is explicit, and
# errors="ignore" keeps the cell re-runnable: `df` is overwritten here, so without it a
# second execution would raise KeyError because the columns are already gone.
df = df.drop(columns=["Unnamed: 0", "user id"], errors="ignore")
In [11]:
# Confirm which columns remain after the drop
df.columns
Out[11]:
Index(['test group', 'converted', 'total ads', 'most ads day',
       'most ads hour'],
      dtype='object')
In [12]:
# Select the variables treated as categorical and count their distinct values
categorical_columns = ['test group', 'converted', 'most ads day', 'most ads hour']
df_cat = df[categorical_columns]
df_cat.nunique()
Out[12]:
test group        2
converted         2
most ads day      7
most ads hour    24
dtype: int64
In [13]:
# List the distinct values of every categorical column
for column_name in df_cat.columns:
    distinct_values = df_cat[column_name].unique()
    print(column_name, ':', distinct_values)
test group : ['ad' 'psa']
converted : [False  True]
most ads day : ['Monday' 'Tuesday' 'Friday' 'Saturday' 'Wednesday' 'Sunday' 'Thursday']
most ads hour : [20 22 18 10 14 13 19 11 12 16 21  3 23  4  8  0  2 15  1  6 17  7  9  5]
In [14]:
# Percentage of rows in each test group
fig = px.histogram(
    df_cat,
    x='test group',
    histnorm='percent',
    text_auto='.1f',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='test group',
    yaxis_title='%',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=600,
    height=400,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()
In [15]:
# Number of rows in each test group
fig = px.histogram(
    df_cat,
    x='test group',
    text_auto='.0f',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='test group',
    yaxis_title='#',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=600,
    height=400,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()
In [16]:
# Percentage of rows per conversion outcome
fig = px.histogram(
    df_cat,
    x='converted',
    histnorm='percent',
    text_auto='.1f',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='converted',
    yaxis_title='%',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=600,
    height=400,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()
In [17]:
# Number of rows per conversion outcome
fig = px.histogram(
    df_cat,
    x='converted',
    text_auto='.0f',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='converted',
    yaxis_title='#',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=600,
    height=400,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()
In [18]:
# Percentage of rows per "most ads day"
fig = px.histogram(
    df_cat,
    x='most ads day',
    histnorm='percent',
    text_auto='.1f',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='most ads day',
    yaxis_title='%',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=600,
    height=400,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()
In [19]:
# Number of rows per "most ads day"
fig = px.histogram(
    df_cat,
    x='most ads day',
    text_auto='.0f',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='most ads day',
    yaxis_title='#',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=600,
    height=400,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()
In [20]:
# Percentage of rows per "most ads hour"
fig = px.histogram(
    df_cat,
    x='most ads hour',
    histnorm='percent',
    barmode='group',
    text_auto='.1f',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='most ads hour',
    yaxis_title='%',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=1000,
    height=400,
    showlegend=False,
    bargap=0.3,
)
fig.update_layout(**layout_options)

# Keep the bar labels horizontal
fig.update_traces(textangle=0)

fig.show()
In [21]:
# Number of rows per "most ads hour"
fig = px.histogram(
    df_cat,
    x='most ads hour',
    barmode='group',
    text_auto='.0f',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='most ads hour',
    yaxis_title='#',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=1000,
    height=400,
    showlegend=False,
    bargap=0.3,
)
fig.update_layout(**layout_options)

# Rotate the bar labels by -90 degrees
fig.update_traces(textangle=-90)

fig.show()
In [22]:
# Distribution of "total ads" (row count per bin, full x-axis range)
fig = px.histogram(
    df,
    x='total ads',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='total ads',
    yaxis_title='#',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=1000,
    height=400,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()
In [23]:
# Box plot of "total ads" (full y-axis range)
fig = px.box(
    df,
    y='total ads',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='',
    yaxis_title='total ads',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=400,
    height=1000,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()
In [24]:
# Summary statistics of "total ads" (count, mean, std, min, quartiles, max)
df['total ads'].describe()
Out[24]:
count    588101.000000
mean         24.820876
std          43.715181
min           1.000000
25%           4.000000
50%          13.000000
75%          27.000000
max        2065.000000
Name: total ads, dtype: float64
In [25]:
# Distribution of "total ads", with the x axis restricted to 0-50
fig = px.histogram(
    df,
    x='total ads',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='total ads',
    yaxis_title='#',
    xaxis_range=[0, 50],
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=1000,
    height=400,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()
In [26]:
# Box plot of "total ads", with the y axis restricted to 0-80
fig = px.box(
    df,
    y='total ads',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='',
    yaxis_title='total ads',
    yaxis_range=[0, 80],
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=400,
    height=1000,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()

Bivariate analysis¶

In [28]:
# Reminder of the columns available for the bivariate analysis
df.columns
Out[28]:
Index(['test group', 'converted', 'total ads', 'most ads day',
       'most ads hour'],
      dtype='object')
In [29]:
# Conversion share per test group (normalized within each row), highest conversion first
ct_conversion_test_group = pd.crosstab(
    index=df['test group'],
    columns=df['converted'],
    normalize='index',
)
ct_conversion_test_group.sort_values(by=True, ascending=False)
Out[29]:
converted False True
test group
ad 0.974453 0.025547
psa 0.982146 0.017854
In [30]:
# Conversion share per "most ads day" (normalized within each row), highest conversion first
ct_conversion_most_ads_day = pd.crosstab(
    index=df['most ads day'],
    columns=df['converted'],
    normalize='index',
)
ct_conversion_most_ads_day.sort_values(by=True, ascending=False)
Out[30]:
converted False True
most ads day
Monday 0.967188 0.032812
Tuesday 0.970160 0.029840
Wednesday 0.975058 0.024942
Sunday 0.975524 0.024476
Friday 0.977788 0.022212
Thursday 0.978429 0.021571
Saturday 0.978949 0.021051
In [31]:
# Conversion share per "most ads hour" (normalized within each row), highest conversion first
ct_conversion_most_ads_hour = pd.crosstab(
    index=df['most ads hour'],
    columns=df['converted'],
    normalize='index',
)
ct_conversion_most_ads_hour.sort_values(by=True, ascending=False)
Out[31]:
converted False True
most ads hour
16 0.969228 0.030772
20 0.970197 0.029803
15 0.970347 0.029653
21 0.971077 0.028923
17 0.971790 0.028210
14 0.971937 0.028063
18 0.972620 0.027380
19 0.973280 0.026720
22 0.973895 0.026105
13 0.975323 0.024677
12 0.976172 0.023828
23 0.977338 0.022662
6 0.977756 0.022244
11 0.977884 0.022116
10 0.978479 0.021521
5 0.979085 0.020915
8 0.980484 0.019516
9 0.980809 0.019191
0 0.981575 0.018425
7 0.981889 0.018111
4 0.984765 0.015235
1 0.987089 0.012911
3 0.989548 0.010452
2 0.992687 0.007313
In [32]:
# Box plot of "total ads" split by conversion outcome (full y-axis range)
fig = px.box(
    df,
    x='converted',
    y='total ads',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='converted',
    yaxis_title='total ads',
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=400,
    height=1000,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()
In [33]:
# Box plot of "total ads" split by conversion outcome, with the y axis restricted to 0-250
fig = px.box(
    df,
    x='converted',
    y='total ads',
    color_discrete_sequence=colorcategories,
)

layout_options = dict(
    title='',
    xaxis_title='converted',
    yaxis_title='total ads',
    yaxis_range=[0, 250],
    font_size=fsize,
    font_color=colortext,
    paper_bgcolor=colorback,
    plot_bgcolor=colorback,
    width=400,
    height=1000,
    showlegend=False,
)
fig.update_layout(**layout_options)

fig.show()

Statistical test¶

In [35]:
# Categorical columns other than the target `converted`
df_cat.columns.drop('converted')
Out[35]:
Index(['test group', 'most ads day', 'most ads hour'], dtype='object')
In [36]:
# Chi-square test of independence between each categorical variable and `converted`
for variable in df_cat.columns.drop('converted'):
    # Observed counts: one row per category, one column per conversion outcome
    contingency_table = pd.crosstab(df[variable], df['converted'])

    # Only the statistic and the p-value (first two returned values) are needed
    test_result = chi2_contingency(contingency_table)
    chi2, p = test_result[0], test_result[1]

    print(f"Variable: {variable}")
    print(f"Chi2 Statistic: {chi2}")
    print(f"P-value: {p}")

    # Decision at the 0.05 significance level
    verdict = (
        f"Reject Null Hypothesis: {variable} and converted variables are dependent."
        if p < 0.05
        else f"Fail to Reject Null Hypothesis: {variable} and converted variable are independent."
    )
    print(verdict)
    print("\n")
Variable: test group
Chi2 Statistic: 54.005823883685245
P-value: 1.9989623063390078e-13
Reject Null Hypothesis: test group and converted variables are dependent.


Variable: most ads day
Chi2 Statistic: 410.0478857936585
P-value: 1.932184379244731e-85
Reject Null Hypothesis: most ads day and converted variables are dependent.


Variable: most ads hour
Chi2 Statistic: 430.76869230822086
P-value: 8.027629823696774e-77
Reject Null Hypothesis: most ads hour and converted variables are dependent.


In [37]:
# Shapiro-Wilk normality check of "total ads", run separately for each conversion outcome.
# NOTE(review): SciPy documents that the Shapiro-Wilk p-value may be inaccurate for
# N > 5000, and the dataset has 588101 rows - consider confirming on a random subsample.
ads_converted = df.loc[df['converted'] == True, 'total ads']
ads_not_converted = df.loc[df['converted'] == False, 'total ads']

shapiro_converted_stat, shapiro_converted_p = shapiro(ads_converted)
shapiro_not_converted_stat, shapiro_not_converted_p = shapiro(ads_not_converted)

# Report both groups in the same format: statistic, p-value, decision at the 0.05 level
shapiro_results = (
    ("converted", shapiro_converted_stat, shapiro_converted_p),
    ("not converted", shapiro_not_converted_stat, shapiro_not_converted_p),
)
for group_label, group_stat, group_p in shapiro_results:
    print(f"Shapiro Statistic for {group_label}: {group_stat}")
    print(f"Shapiro p-value for {group_label}: {group_p}")
    print(
        "Reject H0: Data is not normally distributed."
        if group_p < 0.05
        else "Fail to reject H0: Data is normally disctributed."
    )
Shapiro Statistic for converted: 0.6578396248200824
Shapiro p-value for converted: 1.638680987007771e-98
Reject H0: Data is not normally distributed.
Shapiro Statistic for not converted: 0.4746742488927551
Shapiro p-value for not converted: 9.883049430735801e-204
Reject H0: Data is not normally distributed.
In [ ]:
# Levene's test: do the two conversion outcomes have equal variance of "total ads"?
ads_converted = df.loc[df['converted'] == True, 'total ads']
ads_not_converted = df.loc[df['converted'] == False, 'total ads']

levene_stat, levene_p = levene(ads_converted, ads_not_converted)

print(f"Levene Statistic: {levene_stat}")
print(f"Levene p-value: {levene_p}")

# Decision at the 0.05 significance level
levene_verdict = (
    "Reject H0: Variances are different."
    if levene_p < 0.05
    else "Fail to reject H0: Variances are equal."
)
print(levene_verdict)
Levene Statistic: 9121.196956737573
Levene p-value: 0.0
Reject H0: Variances are different.

Based on the above (neither group is normally distributed and the variances differ), a non-parametric test needs to be used.

In [ ]:
# Mann-Whitney U test: rank-based comparison of "total ads" between converted and
# non-converted rows.
ads_converted = df.loc[df['converted'] == True, 'total ads']
ads_not_converted = df.loc[df['converted'] == False, 'total ads']

# The alternative is stated explicitly. Older SciPy releases defaulted to a deprecated
# one-sided p-value and only emitted a warning about it - and warnings are silenced in
# this notebook - so relying on the default could silently halve the reported p-value.
mannwhitneyu_stat, mannwhitneyu_p = mannwhitneyu(
    ads_converted,
    ads_not_converted,
    alternative='two-sided',
)
print(f"Mann-Whitney U Test Statistic: {mannwhitneyu_stat}")
print(f"Mann-Whitney U Test p-value: {mannwhitneyu_p}")

# Decision at the 0.05 significance level
if mannwhitneyu_p < 0.05:
    print("Reject H0: Significant difference between the two groups.")
else:
    print("Fail to reject H0: No significant difference between the two groups.")

Results¶

To sum up:

  • All of the variables have a statistically significant association with the conversion rate.
  • Customers who saw the ad were more likely to convert.
  • Customers who saw the most ads on Monday or Tuesday were more likely to convert.
  • Customers who saw the most ads at 16:00 or 20:00 were more likely to convert.
  • Customers who saw more ads were more likely to convert.
In [ ]: